{
    "cells": [
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "bcddce7b",
            "metadata": {
                "scrolled": true
            },
            "outputs": [],
            "source": [
                "# Install Presidio and its dependencies (uncomment the lines below to run)\n",
                "#!pip install presidio_analyzer presidio_anonymizer\n",
                "#!python -m spacy download en_core_web_lg\n",
                "#!pip install pandas"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "3345f1c4",
            "metadata": {},
            "source": [
                "###### Path to notebook: [https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb](https://www.github.com/microsoft/presidio/blob/main/docs/samples/python/batch_processing.ipynb)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "gothic-trademark",
            "metadata": {},
            "source": [
                "# Run Presidio on structured / semi-structured data\n",
                "\n",
                "This sample shows how Presidio's batch engines (`BatchAnalyzerEngine` and `BatchAnonymizerEngine`) can be used to anonymize a table or data frame.\n",
                "It introduces methods for the analysis and anonymization of both lists and dicts. \n",
                "\n",
                "Note: the sample inputs here are a pandas DataFrame and a JSON-like nested dictionary, but the same approach can be used in other scenarios such as querying SQL data or using Spark DataFrames.\n"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "roman-allergy",
            "metadata": {},
            "source": [
                "### Set up imports"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 3,
            "id": "extensive-greensboro",
            "metadata": {},
            "outputs": [],
            "source": [
                "from typing import List, Optional, Dict, Union, Iterator, Iterable\n",
                "import collections\n",
                "from dataclasses import dataclass\n",
                "import pprint\n",
                "\n",
                "import pandas as pd\n",
                "\n",
                "from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult, DictAnalyzerResult\n",
                "from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine\n",
                "from presidio_anonymizer.entities import EngineResult\n"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "fiscal-affair",
            "metadata": {},
            "source": [
                "## Example using sample tabular data"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 4,
            "id": "bright-maple",
            "metadata": {},
            "outputs": [],
            "source": [
                "columns = [\"name phrase\", \"phone number phrase\", \"integer\", \"boolean\" ]\n",
                "sample_data = [\n",
                "        ('Charlie likes this', 'Please call 212-555-1234 after 2pm', 1, True),\n",
                "        ('You should talk to Mike', 'his number is 978-428-7111', 2, False),\n",
                "        ('Mary had a little startup', 'Phone number: 202-342-1234', 3, False)\n",
                "]"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 5,
            "id": "russian-proceeding",
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/html": [
                            "<div>\n",
                            "<style scoped>\n",
                            "    .dataframe tbody tr th:only-of-type {\n",
                            "        vertical-align: middle;\n",
                            "    }\n",
                            "\n",
                            "    .dataframe tbody tr th {\n",
                            "        vertical-align: top;\n",
                            "    }\n",
                            "\n",
                            "    .dataframe thead th {\n",
                            "        text-align: right;\n",
                            "    }\n",
                            "</style>\n",
                            "<table border=\"1\" class=\"dataframe\">\n",
                            "  <thead>\n",
                            "    <tr style=\"text-align: right;\">\n",
                            "      <th></th>\n",
                            "      <th>name phrase</th>\n",
                            "      <th>phone number phrase</th>\n",
                            "      <th>integer</th>\n",
                            "      <th>boolean</th>\n",
                            "    </tr>\n",
                            "  </thead>\n",
                            "  <tbody>\n",
                            "    <tr>\n",
                            "      <th>0</th>\n",
                            "      <td>Charlie likes this</td>\n",
                            "      <td>Please call 212-555-1234 after 2pm</td>\n",
                            "      <td>1</td>\n",
                            "      <td>True</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>1</th>\n",
                            "      <td>You should talk to Mike</td>\n",
                            "      <td>his number is 978-428-7111</td>\n",
                            "      <td>2</td>\n",
                            "      <td>False</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>2</th>\n",
                            "      <td>Mary had a little startup</td>\n",
                            "      <td>Phone number: 202-342-1234</td>\n",
                            "      <td>3</td>\n",
                            "      <td>False</td>\n",
                            "    </tr>\n",
                            "  </tbody>\n",
                            "</table>\n",
                            "</div>"
                        ],
                        "text/plain": [
                            "                 name phrase                 phone number phrase  integer  \\\n",
                            "0         Charlie likes this  Please call 212-555-1234 after 2pm        1   \n",
                            "1    You should talk to Mike          his number is 978-428-7111        2   \n",
                            "2  Mary had a little startup          Phone number: 202-342-1234        3   \n",
                            "\n",
                            "   boolean  \n",
                            "0     True  \n",
                            "1    False  \n",
                            "2    False  "
                        ]
                    },
                    "execution_count": 5,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "# Create Pandas DataFrame\n",
                "df  = pd.DataFrame(sample_data,columns=columns)\n",
                "\n",
                "df"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 6,
            "id": "brazilian-punch",
            "metadata": {},
            "outputs": [],
            "source": [
                "# DataFrame to dict\n",
                "df_dict = df.to_dict(orient=\"list\")"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 7,
            "id": "fixed-commerce",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "{'boolean': [True, False, False],\n",
                        " 'integer': [1, 2, 3],\n",
                        " 'name phrase': ['Charlie likes this',\n",
                        "                 'You should talk to Mike',\n",
                        "                 'Mary had a little startup'],\n",
                        " 'phone number phrase': ['Please call 212-555-1234 after 2pm',\n",
                        "                         'his number is 978-428-7111',\n",
                        "                         'Phone number: 202-342-1234']}\n"
                    ]
                }
            ],
            "source": [
                "pprint.pprint(df_dict)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 8,
            "id": "verified-spirituality",
            "metadata": {},
            "outputs": [],
            "source": [
                "analyzer = AnalyzerEngine()\n",
                "batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)\n",
                "batch_anonymizer = BatchAnonymizerEngine()"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 9,
            "id": "narrative-freeze",
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/plain": [
                            "[DictAnalyzerResult(key='name phrase', value=['Charlie likes this', 'You should talk to Mike', 'Mary had a little startup'], recognizer_results=[[type: PERSON, start: 0, end: 7, score: 0.85], [type: PERSON, start: 19, end: 23, score: 0.85], [type: PERSON, start: 0, end: 4, score: 0.85]]),\n",
                            " DictAnalyzerResult(key='phone number phrase', value=['Please call 212-555-1234 after 2pm', 'his number is 978-428-7111', 'Phone number: 202-342-1234'], recognizer_results=[[type: DATE_TIME, start: 31, end: 34, score: 0.85, type: PHONE_NUMBER, start: 12, end: 24, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75], [type: PHONE_NUMBER, start: 14, end: 26, score: 0.75]]),\n",
                            " DictAnalyzerResult(key='integer', value=[1, 2, 3], recognizer_results=[[], [], []]),\n",
                            " DictAnalyzerResult(key='boolean', value=[True, False, False], recognizer_results=[[], [], []])]"
                        ]
                    },
                    "execution_count": 9,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "analyzer_results = batch_analyzer.analyze_dict(df_dict, language=\"en\")\n",
                "analyzer_results = list(analyzer_results)\n",
                "analyzer_results"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 10,
            "id": "rural-month",
            "metadata": {},
            "outputs": [],
            "source": [
                "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 11,
            "id": "acute-mauritius",
            "metadata": {},
            "outputs": [],
            "source": [
                "scrubbed_df = pd.DataFrame(anonymizer_results)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 12,
            "id": "irish-phoenix",
            "metadata": {},
            "outputs": [
                {
                    "data": {
                        "text/html": [
                            "<div>\n",
                            "<style scoped>\n",
                            "    .dataframe tbody tr th:only-of-type {\n",
                            "        vertical-align: middle;\n",
                            "    }\n",
                            "\n",
                            "    .dataframe tbody tr th {\n",
                            "        vertical-align: top;\n",
                            "    }\n",
                            "\n",
                            "    .dataframe thead th {\n",
                            "        text-align: right;\n",
                            "    }\n",
                            "</style>\n",
                            "<table border=\"1\" class=\"dataframe\">\n",
                            "  <thead>\n",
                            "    <tr style=\"text-align: right;\">\n",
                            "      <th></th>\n",
                            "      <th>name phrase</th>\n",
                            "      <th>phone number phrase</th>\n",
                            "      <th>integer</th>\n",
                            "      <th>boolean</th>\n",
                            "    </tr>\n",
                            "  </thead>\n",
                            "  <tbody>\n",
                            "    <tr>\n",
                            "      <th>0</th>\n",
                            "      <td>&lt;PERSON&gt; likes this</td>\n",
                            "      <td>Please call &lt;PHONE_NUMBER&gt; after &lt;DATE_TIME&gt;</td>\n",
                            "      <td>1</td>\n",
                            "      <td>True</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>1</th>\n",
                            "      <td>You should talk to &lt;PERSON&gt;</td>\n",
                            "      <td>his number is &lt;PHONE_NUMBER&gt;</td>\n",
                            "      <td>2</td>\n",
                            "      <td>False</td>\n",
                            "    </tr>\n",
                            "    <tr>\n",
                            "      <th>2</th>\n",
                            "      <td>&lt;PERSON&gt; had a little startup</td>\n",
                            "      <td>Phone number: &lt;PHONE_NUMBER&gt;</td>\n",
                            "      <td>3</td>\n",
                            "      <td>False</td>\n",
                            "    </tr>\n",
                            "  </tbody>\n",
                            "</table>\n",
                            "</div>"
                        ],
                        "text/plain": [
                            "                     name phrase  \\\n",
                            "0            <PERSON> likes this   \n",
                            "1    You should talk to <PERSON>   \n",
                            "2  <PERSON> had a little startup   \n",
                            "\n",
                            "                            phone number phrase integer boolean  \n",
                            "0  Please call <PHONE_NUMBER> after <DATE_TIME>       1    True  \n",
                            "1                  his number is <PHONE_NUMBER>       2   False  \n",
                            "2                  Phone number: <PHONE_NUMBER>       3   False  "
                        ]
                    },
                    "execution_count": 12,
                    "metadata": {},
                    "output_type": "execute_result"
                }
            ],
            "source": [
                "scrubbed_df"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "1cb4b006",
            "metadata": {},
            "source": [
                "## Example using JSON"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 13,
            "id": "1063019b",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n",
                        " 'key_b': {'www.abc.com'},\n",
                        " 'key_c': 3,\n",
                        " 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}\n"
                    ]
                }
            ],
            "source": [
                "nested_dict = {\n",
                "    \"key_a\": {\"key_a1\": \"My phone number is 212-121-1424\"},\n",
                "    \"key_b\": {\"www.abc.com\"},\n",
                "    \"key_c\": 3,\n",
                "    \"names\": [\"James Bond\", \"Clark Kent\", \"Hakeem Olajuwon\", \"No name here!\"]\n",
                "}\n",
                "\n",
                "pprint.pprint(nested_dict)"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 14,
            "id": "e3c09b4b",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "{'key_a': {'key_a1': 'My phone number is <PHONE_NUMBER>'},\n",
                        " 'key_b': ['<URL>'],\n",
                        " 'key_c': 3,\n",
                        " 'names': ['<PERSON>', '<PERSON>', '<PERSON>', 'No name here!']}\n"
                    ]
                }
            ],
            "source": [
                "# Analyze dict\n",
                "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\")\n",
                "\n",
                "# Anonymize dict\n",
                "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n",
                "pprint.pprint(anonymizer_results)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "e593eb11",
            "metadata": {},
            "source": [
                "### Ignoring specific keys"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 15,
            "id": "84b2ef95",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n",
                        " 'key_b': ['<URL>'],\n",
                        " 'key_c': 3,\n",
                        " 'names': ['James Bond', 'Clark Kent', 'Hakeem Olajuwon', 'No name here!']}\n"
                    ]
                }
            ],
            "source": [
                "keys_to_skip=[\"key_a1\", \"names\"]\n",
                "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\", keys_to_skip=keys_to_skip)\n",
                "\n",
                "# Anonymize dict\n",
                "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n",
                "pprint.pprint(anonymizer_results)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "bd0cde2a",
            "metadata": {},
            "source": [
                "### Ignoring nested keys"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": 16,
            "id": "93ed8769",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "{'key_a': {'key_a1': 'My phone number is 212-121-1424'},\n",
                        " 'key_b': ['<URL>'],\n",
                        " 'key_c': 3,\n",
                        " 'names': ['<PERSON>', '<PERSON>', '<PERSON>', 'No name here!']}\n"
                    ]
                }
            ],
            "source": [
                "keys_to_skip = [\"key_a.key_a1\"]\n",
                "\n",
                "analyzer_results = batch_analyzer.analyze_dict(input_dict = nested_dict, language=\"en\", keys_to_skip=keys_to_skip)\n",
                "\n",
                "# Anonymize dict\n",
                "anonymizer_results = batch_anonymizer.anonymize_dict(analyzer_results = analyzer_results)\n",
                "pprint.pprint(anonymizer_results)"
            ]
        },
        {
            "cell_type": "markdown",
            "id": "aa0ab530",
            "metadata": {},
            "source": [
                "#### **Note!**\n",
                "\n",
                "JSON files with objects within lists, e.g.:\n",
                "```\n",
                "{\n",
                "  \"key\": [\n",
                "    {\n",
                "      \"key2\": \"Peter Parker\"\n",
                "    },\n",
                "    {\n",
                "      \"key3\": \"555-1234\"\n",
                "    }\n",
                "  ]\n",
                "}\n",
                "```\n",
                "\n",
                "are not yet supported. Consider breaking the JSON into parts if needed."
            ]
        },
        {
            "cell_type": "markdown",
            "id": "c708ff56",
            "metadata": {},
            "source": [
                "## Multiprocessing\n",
                "\n",
                "`BatchAnalyzerEngine` builds upon spaCy's pipelines. For more info about multiprocessing, see https://spacy.io/usage/processing-pipelines#multiprocessing.\n",
                "\n",
                "In Presidio, one can pass the `n_process` and `batch_size` arguments (for example, to `BatchAnalyzerEngine.analyze_iterator`) to define how processing is done in parallel."
            ]
        },
        {
            "cell_type": "markdown",
            "id": "81316c6c",
            "metadata": {},
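            "source": [
                "The example below analyzes 1,000 short texts using 4 processes. While the analysis runs, a background process prints the PIDs of all running Python processes once per second, so the worker processes can be seen starting and exiting."
            ]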
        },
        {
            "cell_type": "code",
            "execution_count": 25,
            "id": "09a80e87",
            "metadata": {},
            "outputs": [
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "[Monitor] Active Python processes: 4 - [38773, 38774, 45860, 109966]\n"
                    ]
                },
                {
                    "name": "stdout",
                    "output_type": "stream",
                    "text": [
                        "[Monitor] Active Python processes: 8 - [38773, 38774, 45860, 109966, 109973, 109976, 109977, 109978]\n",
                        "[Monitor] Active Python processes: 8 - [38773, 38774, 45860, 109966, 109973, 109976, 109977, 109978]\n",
                        "[Monitor] Active Python processes: 8 - [38773, 38774, 45860, 109966, 109973, 109976, 109977, 109978]\n",
                        "[Monitor] Active Python processes: 8 - [38773, 38774, 45860, 109966, 109973, 109976, 109977, 109978]\n",
                        "[Monitor] Active Python processes: 4 - [38773, 38774, 45860, 109966]\n"
                    ]
                }
            ],
            "source": [
                "import multiprocessing\n",
                "import psutil\n",
                "import time\n",
                "\n",
                "def analyze_batch_multiprocess(n_process=12, batch_size=4, texts=None):\n",
                "    \"\"\"Run BatchAnalyzer with `n_process` processes and batch size of `batch_size`.\n",
                "\n",
                "    :param n_process: Number of processes, forwarded to `analyze_iterator`.\n",
                "    :param batch_size: Batch size, forwarded to `analyze_iterator`.\n",
                "    :param texts: Optional iterable of texts to analyze. When omitted,\n",
                "        1000 copies of a short sample sentence are used.\n",
                "    :return: The results of `analyze_iterator`, collected into a list.\n",
                "    \"\"\"\n",
                "    if texts is None:\n",
                "        texts = [\"My name is mike\"] * 1000\n",
                "\n",
                "    results = batch_analyzer.analyze_iterator(\n",
                "        texts=texts,\n",
                "        language=\"en\",\n",
                "        n_process=n_process,\n",
                "        batch_size=batch_size,\n",
                "    )\n",
                "\n",
                "    return list(results)\n",
                "\n",
                "\n",
                "\n",
                "def monitor_processes(interval=1):\n",
                "    \"\"\"Print the PIDs of all running Python processes every `interval` seconds.\n",
                "\n",
                "    Loops forever, so it is meant to run in a background process that\n",
                "    the caller terminates.\n",
                "\n",
                "    :param interval: Seconds to sleep between two snapshots.\n",
                "    \"\"\"\n",
                "    while True:\n",
                "        python_pids = [\n",
                "            proc.info['pid']\n",
                "            for proc in psutil.process_iter(attrs=['pid', 'name'])\n",
                "            # psutil reports a name of None for processes it is denied access to,\n",
                "            # and the executable name's casing can differ between platforms.\n",
                "            if \"python\" in (proc.info['name'] or \"\").lower()\n",
                "        ]\n",
                "        print(f\"[Monitor] Active Python processes: {len(python_pids)} - {python_pids}\")\n",
                "        time.sleep(interval)\n",
                "\n",
                "\n",
                "# Run interactive monitoring in a background process\n",
                "monitor_proc = multiprocessing.Process(target=monitor_processes, daemon=True)\n",
                "monitor_proc.start()\n",
                "\n",
                "try:\n",
                "    # Run the batch analyzer process\n",
                "    analyze_batch_multiprocess(n_process=4, batch_size=2)\n",
                "\n",
                "    # Wait for everything to conclude\n",
                "    time.sleep(1)\n",
                "finally:\n",
                "    # Always stop the monitor, even if the analysis fails; otherwise it keeps\n",
                "    # printing for as long as the notebook kernel is alive.\n",
                "    monitor_proc.terminate()\n",
                "    # Reap the terminated child so it does not linger as a zombie process.\n",
                "    monitor_proc.join()\n"
            ]
        },
        {
            "cell_type": "code",
            "execution_count": null,
            "id": "7b7b6c64",
            "metadata": {},
            "outputs": [],
            "source": []
        }
    ],
    "metadata": {
        "kernelspec": {
            "display_name": "presidio-analyzer-sAyh6tzK-py3.12",
            "language": "python",
            "name": "python3"
        },
        "language_info": {
            "codemirror_mode": {
                "name": "ipython",
                "version": 3
            },
            "file_extension": ".py",
            "mimetype": "text/x-python",
            "name": "python",
            "nbconvert_exporter": "python",
            "pygments_lexer": "ipython3",
            "version": "3.12.1"
        }
    },
    "nbformat": 4,
    "nbformat_minor": 5
}
